Summary

PCA for wecare-only (b37) wes dataset w/o Danish:

  • Detects the same outliers as the previous analyses: P6_D05 and P5_E09
  • Shows no irregularities in PCA analysis
  • Provides evidence for using top 2 PC-s in regression model

References:

Start section

# Time
Sys.time()
## [1] "2021-01-22 23:49:59 GMT"
# Memory
gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 411912 22.0     856132 45.8         NA   641517 34.3
## Vcells 789331  6.1    8388608 64.0      16384  1768563 13.5
# Clean up
rm(list=ls())
graphics.off()

# Options
options(stringsAsFactors = F)

# Working folders
base_folder <- "/Users/alexey/Documents" # mac
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021") # mac

scripts_folder <- file.path(project_folder,"scripts","s08_pca")
setwd(scripts_folder)

data_folder <- file.path(project_folder,"data","s08_pca")

library(bigsnpr) # for bed_autoSVD() and bed()
## Loading required package: bigstatsr
library(bigutilsr) # for prob_dist() and tukey_mc_up() for outlier detection
library(hexbin) # for plotting svd loadings
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
NCORES <- 1
#NCORES <- nb_cores() # 2

Read bed-bim-fam file-set

# Location of bed file
bed_file <- file.path(data_folder,"s03_non_related","common_hwe_norel.bed")

# Attach PLINK data to R environment
wecare.bed <- bed(bed_file) # bigsnpr::bed

# Explore wecare.bed
wecare.bed
## A 'bed' object with 335 samples and 39628 variants.
attributes(wecare.bed)
## $.xData
## <environment: 0x7f8ec82a9668>
## 
## $class
## [1] "bed"
## attr(,"package")
## [1] "bigsnpr"
str(wecare.bed)
## Reference class 'bed' [package "bigsnpr"] with 13 fields
##  $ bedfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bed"
##  $ extptr :<externalptr> 
##  $ .fam   :'data.frame': 335 obs. of  6 variables:
##   ..$ family.ID  : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##   ..$ sample.ID  : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##   ..$ paternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ maternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ sex        : int [1:335] 2 2 2 2 2 2 2 2 2 2 ...
##   ..$ affection  : int [1:335] 2 2 2 2 2 2 1 2 2 1 ...
##  $ .map   :'data.frame': 39628 obs. of  6 variables:
##   ..$ chromosome  : int [1:39628] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ marker.ID   : chr [1:39628] "1_881627_G_A" "1_897325_G_C" "1_897564_T_C" "1_897738_C_T" ...
##   ..$ genetic.dist: int [1:39628] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ physical.pos: int [1:39628] 881627 897325 897564 897738 911916 982941 982994 986443 987200 990380 ...
##   ..$ allele1     : chr [1:39628] "G" "G" "T" "T" ...
##   ..$ allele2     : chr [1:39628] "A" "C" "C" "C" ...
##  $ prefix : chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel"
##  $ bimfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bim"
##  $ famfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.fam"
##  $ fam    :'data.frame': 335 obs. of  6 variables:
##   ..$ family.ID  : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##   ..$ sample.ID  : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##   ..$ paternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ maternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ sex        : int [1:335] 2 2 2 2 2 2 2 2 2 2 ...
##   ..$ affection  : int [1:335] 2 2 2 2 2 2 1 2 2 1 ...
##  $ map    :'data.frame': 39628 obs. of  6 variables:
##   ..$ chromosome  : int [1:39628] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ marker.ID   : chr [1:39628] "1_881627_G_A" "1_897325_G_C" "1_897564_T_C" "1_897738_C_T" ...
##   ..$ genetic.dist: int [1:39628] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ physical.pos: int [1:39628] 881627 897325 897564 897738 911916 982941 982994 986443 987200 990380 ...
##   ..$ allele1     : chr [1:39628] "G" "G" "T" "T" ...
##   ..$ allele2     : chr [1:39628] "A" "C" "C" "C" ...
##  $ nrow   : int 335
##  $ ncol   : int 39628
##  $ light  :Reference class 'bed_light' [package "bigsnpr"] with 5 fields
##   ..$ bedfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bed"
##   ..$ nrow   : int 335
##   ..$ ncol   : int 39628
##   ..$ extptr :<externalptr> 
##   ..$ address:<externalptr> 
##   ..and 15 methods, of which 1 is  possibly relevant:
##   ..  initialize
##  $ address:<externalptr> 
##  and 16 methods, of which 2 are  possibly relevant:
##    initialize, show#envRefClass
wecare.bed$bedfile
## [1] "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bed"
wecare.bed$address
## <pointer: 0x7f8ec148a6c0>
# Clean-up
rm(bed_file)

Phenotypes

wecare_fam.df <- wecare.bed$fam
dim(wecare_fam.df)
## [1] 335   6
head(wecare_fam.df)
##   family.ID sample.ID paternal.ID maternal.ID sex affection
## 1    P1_A01    P1_A01           0           0   2         2
## 2    P1_A02    P1_A02           0           0   2         2
## 3    P1_A03    P1_A03           0           0   2         2
## 4    P1_A04    P1_A04           0           0   2         2
## 5    P1_A05    P1_A05           0           0   2         2
## 6    P1_A06    P1_A06           0           0   2         2
# Phenotypes from external file
load(file.path(project_folder,"data","s06_filter","s04_filter_by_sample_call_rates.RData"))
rm(genotypes.mx,variants.df)

project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021") # mac
scripts_folder <- file.path(project_folder,"scripts","s08_pca")
data_folder <- file.path(project_folder,"data","s08_pca")

dim(phenotypes.df)
## [1] 335  40
colnames(phenotypes.df)
##  [1] "wes_id"         "gwas_id"        "merged_id"      "filter"        
##  [5] "cc"             "setno"          "registry"       "family_history"
##  [9] "age_dx"         "age_ref"        "rstime"         "eig1_gwas"     
## [13] "eig2_gwas"      "eig3_gwas"      "eig4_gwas"      "eig5_gwas"     
## [17] "stage"          "er"             "pr"             "hist_cat"      
## [21] "hormone"        "chemo_cat"      "br_xray"        "br_xray_dose"  
## [25] "age_menarche"   "age_1st_ftp"    "age_menopause"  "num_preg"      
## [29] "bmi_age18"      "bmi_dx"         "bmi_ref"        "eig1_wecare"   
## [33] "eig2_wecare"    "eig3_wecare"    "eig4_wecare"    "eig5_wecare"   
## [37] "Batch_ID"       "Subject_ID"     "Barcode"        "danish"
# Merge fam-file and phenotypes from external file 
wecare_phenotypes.df <- left_join(wecare_fam.df, phenotypes.df,
                                  by=c("sample.ID"="wes_id"))
dim(wecare_phenotypes.df)
## [1] 335  45
colnames(wecare_phenotypes.df)
##  [1] "family.ID"      "sample.ID"      "paternal.ID"    "maternal.ID"   
##  [5] "sex"            "affection"      "gwas_id"        "merged_id"     
##  [9] "filter"         "cc"             "setno"          "registry"      
## [13] "family_history" "age_dx"         "age_ref"        "rstime"        
## [17] "eig1_gwas"      "eig2_gwas"      "eig3_gwas"      "eig4_gwas"     
## [21] "eig5_gwas"      "stage"          "er"             "pr"            
## [25] "hist_cat"       "hormone"        "chemo_cat"      "br_xray"       
## [29] "br_xray_dose"   "age_menarche"   "age_1st_ftp"    "age_menopause" 
## [33] "num_preg"       "bmi_age18"      "bmi_dx"         "bmi_ref"       
## [37] "eig1_wecare"    "eig2_wecare"    "eig3_wecare"    "eig4_wecare"   
## [41] "eig5_wecare"    "Batch_ID"       "Subject_ID"     "Barcode"       
## [45] "danish"
# Make sure that dplyr::left_joint hasnt changed the order of samples
sum(substr(wecare_phenotypes.df$merged_id,1,6) != wecare_fam.df$sample.ID)
## [1] 0
# Add column fopr outliers
wecare_phenotypes.df <- data.frame(wecare_phenotypes.df,outlier=F)

# Clean-up
rm(phenotypes.df, wecare_fam.df)

Variants

# map file
wecare_map.df <- wecare.bed$map
dim(wecare_map.df)
## [1] 39628     6
head(wecare_map.df)
##   chromosome    marker.ID genetic.dist physical.pos allele1 allele2
## 1          1 1_881627_G_A            0       881627       G       A
## 2          1 1_897325_G_C            0       897325       G       C
## 3          1 1_897564_T_C            0       897564       T       C
## 4          1 1_897738_C_T            0       897738       T       C
## 5          1 1_911916_C_T            0       911916       T       C
## 6          1 1_982941_T_C            0       982941       T       C
# make simple counts 
wecare_maf.df <- bed_MAF(wecare.bed)
dim(wecare_maf.df)
## [1] 39628     5
head(wecare_maf.df)
##    ac mac         af        maf   N
## 1 196 196 0.33793103 0.33793103 290
## 2  42  42 0.07266436 0.07266436 289
## 3  43  43 0.06847134 0.06847134 314
## 4  41  41 0.07093426 0.07093426 289
## 5  85  85 0.12724551 0.12724551 334
## 6  61  61 0.09327217 0.09327217 327
# merge map file with the counts
wecare_variants.df <- cbind(wecare_map.df,wecare_maf.df)
dim(wecare_variants.df)
## [1] 39628    11
head(wecare_variants.df)
##   chromosome    marker.ID genetic.dist physical.pos allele1 allele2  ac mac
## 1          1 1_881627_G_A            0       881627       G       A 196 196
## 2          1 1_897325_G_C            0       897325       G       C  42  42
## 3          1 1_897564_T_C            0       897564       T       C  43  43
## 4          1 1_897738_C_T            0       897738       T       C  41  41
## 5          1 1_911916_C_T            0       911916       T       C  85  85
## 6          1 1_982941_T_C            0       982941       T       C  61  61
##           af        maf   N
## 1 0.33793103 0.33793103 290
## 2 0.07266436 0.07266436 289
## 3 0.06847134 0.06847134 314
## 4 0.07093426 0.07093426 289
## 5 0.12724551 0.12724551 334
## 6 0.09327217 0.09327217 327
# Variants with AF(ref) < AF(alt)
sum(wecare_variants.df$ac != wecare_variants.df$mac)
## [1] 0
# Clean-up
rm(wecare_map.df, wecare_maf.df)

1st round of PCA

Takes care about LD etc.
See ?plot.big_SVD for plotting svd objets.

# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers1 <- which(!wecare_phenotypes.df$outlier)
length(non_outliers1)
## [1] 335
# bigsnpr::bed_autoSVD, Default k = 10
#using all samples (ind.row) and variants (ind.col) 
#table(wecare.bed$map$chromosome) - if complains abotut non-numeric chromosomes
wecare.svd1 <- bed_autoSVD(wecare.bed, 
                          ind.row=non_outliers1, 
                          ncores = NCORES) 
## 
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 17837 variants.
## Discarding 0 variant with MAC < 10.
## 
## Iteration 1:
## Computing SVD..
## 128 outlier variants detected..
## 1 long-range LD region detected..
## 
## Iteration 2:
## Computing SVD..
## 0 outlier variant detected..
## 
## Converged!
#ind.col=vars_not_in_LD, 

# Variants not in LD (detected by clumping during autoSVD)
vars_not_in_LD1 <- attr(wecare.svd1, "subset")
length(vars_not_in_LD1)
## [1] 17709
#attributes(wecare.svd)
str(wecare.svd1)
## List of 7
##  $ d     : num [1:10] 196 183 152 152 151 ...
##  $ u     : num [1:335, 1:10] 0.1363 0.1451 -0.0121 -0.0211 -0.0259 ...
##  $ v     : num [1:17709, 1:10] 0.000921 -0.005054 0.004853 0.00721 -0.009297 ...
##  $ niter : num 15
##  $ nops  : num 244
##  $ center: num [1:17709] 0.676 0.137 0.254 0.211 0.511 ...
##  $ scale : num [1:17709] 0.669 0.357 0.471 0.435 0.617 ...
##  - attr(*, "class")= chr "big_SVD"
##  - attr(*, "subset")= int [1:17709] 1 3 5 9 10 11 12 15 16 20 ...
##  - attr(*, "lrldr")='data.frame':    1 obs. of  3 variables:
##   ..$ Chr  : num 6
##   ..$ Start: num 32796751
##   ..$ Stop : num 42650772
# Eigenvalues
length(wecare.svd1$d)
## [1] 10
wecare.svd1$d
##  [1] 195.8245 182.9719 151.7824 151.5087 151.1509 150.7539 150.6255 150.5100
##  [9] 150.1112 149.5871
plot(wecare.svd1) # default type="screeplot" see ?plot.big_SVD  
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?

# Eigenvectors
dim(wecare.svd1$u)
## [1] 335  10
head(wecare.svd1$u)
##             [,1]         [,2]         [,3]         [,4]         [,5]
## [1,]  0.13630917  0.105728563 -0.058869640  0.053270057 -0.071375682
## [2,]  0.14509744  0.060019838  0.009664619 -0.036790376  0.004449234
## [3,] -0.01212106 -0.003851353 -0.028351214  0.034059416  0.084183131
## [4,] -0.02109911  0.018570166  0.071605066 -0.005023272  0.002584699
## [5,] -0.02591967 -0.028676699  0.072597633  0.010689744  0.060390468
## [6,] -0.01649853 -0.012722733  0.023779948  0.053496518 -0.081919467
##              [,6]        [,7]         [,8]         [,9]        [,10]
## [1,] -0.083141571  0.09325142 -0.054764636  0.029491944  0.028048716
## [2,]  0.004385738 -0.02111137  0.008643476 -0.004200611  0.063309640
## [3,] -0.068367578  0.01421584  0.008810475  0.001191556 -0.031455299
## [4,]  0.030187082 -0.05529577  0.003396932  0.070203572  0.032446894
## [5,] -0.022477192  0.07267845  0.064590228 -0.001438803  0.003509294
## [6,]  0.008744997  0.03610154 -0.024077602  0.141481805 -0.053570862
# PCA summary (for PCs from 1 to 10)
plot(wecare.svd1,type = "scores",scores=1:10,coeff=0.4)

# Loadings
dim(wecare.svd1$v)
## [1] 17709    10
head(wecare.svd1$v)
##              [,1]          [,2]          [,3]         [,4]         [,5]
## [1,]  0.000920977 -0.0008277551  0.0005237386 -0.008730989  0.009041437
## [2,] -0.005053747  0.0044461637  0.0027742776 -0.010116289 -0.002725499
## [3,]  0.004853187  0.0028474205 -0.0051168462 -0.012379625  0.013007407
## [4,]  0.007210244  0.0021103606 -0.0011785656  0.006364635  0.009475204
## [5,] -0.009296606  0.0052304251  0.0013548087  0.001468288 -0.007015240
## [6,]  0.009219696 -0.0192255697 -0.0082900970  0.014792149  0.002904332
##              [,6]         [,7]         [,8]          [,9]         [,10]
## [1,]  0.005177317  0.006716877 -0.014870474 -0.0077179427  0.0077299110
## [2,] -0.005139297 -0.006675946 -0.002947939  0.0030950364  0.0003375347
## [3,]  0.010588261  0.004296047 -0.014231075 -0.0146200084 -0.0028576898
## [4,]  0.006088027  0.006850451  0.001843243  0.0025879821 -0.0043882490
## [5,] -0.002124761 -0.012870763 -0.002623696 -0.0000418261 -0.0001034652
## [6,] -0.008097585 -0.012913949  0.017080498  0.0004770108 -0.0125200209
# Loadings summary (for PCs from 1 to 10)
plot(wecare.svd1,type="loadings",loadings=1:10,coeff=0.4)

# Calculate a measure of "outlieness"  
U1 <- wecare.svd1$u
prob1 <- prob_dist(U1, ncores=NCORES) # bigutilsr::prob_dist
S1 <- prob1$dist.self / sqrt(prob1$dist.nn)
tukey_threshold1 <- tukey_mc_up(S1) # bigutilsr::tukey_mc_up

# Outliers
outliers1 <- S1 >= tukey_threshold1
sum(outliers1)
## [1] 1
outliers_id1 <- wecare.bed$fam$sample.ID[S1 >= tukey_threshold1]
outliers_id1
## [1] "P6_D05"
# Histogram by "outlieness" score
ggplot() +
  geom_histogram(aes(S1), color = "black", fill = "blue", alpha = 0.3) +
  theme_bigstatsr() +
  geom_vline(xintercept=tukey_threshold1, colour="red") +
  labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Location of outlier(s) in PCA plots
plot(U1[, 1:2], col = (S1 > tukey_threshold1) + 1, pch = 20)

plot(U1[, 3:4], col = (S1 > tukey_threshold1) + 1, pch = 20)

plot(U1[, 5:6], col = (S1 > tukey_threshold1) + 1, pch = 20)

# Add outlier to the phenotypes data frame
wecare_phenotypes.df$outlier <- wecare_phenotypes.df$outlier | 
  wecare_phenotypes.df$sample.ID %in% outliers_id1

sum(wecare_phenotypes.df$outlier)
## [1] 1
# Clean-up
rm(non_outliers1,U1,prob1,S1,tukey_threshold1,outliers_id1,
   outliers1,wecare.svd1) #vars_not_in_LD1

2nd round of PCA

Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up

https://privefl.github.io/bigsnpr/articles/bedpca.html

# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers2 <- which(!wecare_phenotypes.df$outlier)
length(non_outliers2)
## [1] 334
# Calculate PCA
wecare.svd2 <- bed_autoSVD(wecare.bed, 
                          ind.row=non_outliers2, 
                          ind.col=vars_not_in_LD1,
                          ncores = NCORES) 
## 
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 17677 variants.
## Discarding 0 variant with MAC < 10.
## 
## Iteration 1:
## Computing SVD..
## 0 outlier variant detected..
## 
## Converged!
# ind.col=vars_not_in_LD1 - removes the outlier 

# Variants not in LD (detected by clumping during autoSVD)
#vars_not_in_LD2 <- attr(wecare.svd2, "subset")
#ength(vars_not_in_LD2)

# Explore PCA results
plot(wecare.svd2)
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?

plot(wecare.svd2, type = "loadings", loadings=1:10, coeff=0.4)

plot(wecare.svd2,type = "scores",scores=1:10,coeff=0.4)

# Calculate a measure of "outlieness"  
U2 <- wecare.svd2$u
prob2 <- prob_dist(U2, ncores=NCORES) # bigutilsr::prob_dist
S2 <- prob2$dist.self / sqrt(prob2$dist.nn)
tukey_threshold2 <- tukey_mc_up(S2) # bigutilsr::tukey_mc_up

# Outliers
outliers2 <- S2 >= tukey_threshold2
sum(outliers2)
## [1] 1
outliers_id2 <- wecare.bed$fam$sample.ID[S2 >= tukey_threshold2]
outliers_id2
## [1] "P5_E09"
# Histogram by "outlieness" score
ggplot() +
  geom_histogram(aes(S2), color = "black", fill = "blue", alpha = 0.3) +
  theme_bigstatsr() +
  geom_vline(xintercept=tukey_threshold2, colour="red") +
  labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Location of outlier(s) in PCA plots
plot(U2[, 1:2], col = (S2 > tukey_threshold2) + 1, pch = 20)

plot(U2[, 3:4], col = (S2 > tukey_threshold2) + 1, pch = 20)

plot(U2[, 5:6], col = (S2 > tukey_threshold2) + 1, pch = 20)

# Add outlier to the phenotypes data frame
wecare_phenotypes.df$outlier <- wecare_phenotypes.df$outlier | 
  wecare_phenotypes.df$sample.ID %in% outliers_id2

sum(wecare_phenotypes.df$outlier)
## [1] 2
# Clean-up
rm(non_outliers2,U2,prob2,S2,tukey_threshold2,outliers_id2,
   outliers2,wecare.svd2) # vars_not_in_LD2

3rd round of PCA

Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up

https://privefl.github.io/bigsnpr/articles/bedpca.html

# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers3 <- which(!wecare_phenotypes.df$outlier)
length(non_outliers3)
## [1] 333
# Calculate PCA
wecare.svd3 <- bed_autoSVD(wecare.bed, 
                          ind.row=non_outliers3, 
                          ind.col=vars_not_in_LD1, 
                          ncores = NCORES) 
## 
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 17653 variants.
## Discarding 0 variant with MAC < 10.
## 
## Iteration 1:
## Computing SVD..
## 0 outlier variant detected..
## 
## Converged!
# Explore PCA result
plot(wecare.svd3)
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?

plot(wecare.svd3, type = "loadings", loadings=1:10, coeff=0.4)

plot(wecare.svd3,type = "scores",scores=1:10,coeff=0.4)

# Calculate a measure of "outlieness"  
U3 <- wecare.svd3$u
prob3 <- prob_dist(U3, ncores=NCORES) # bigutilsr::prob_dist
S3 <- prob3$dist.self / sqrt(prob3$dist.nn)
tukey_threshold3 <- tukey_mc_up(S3) # bigutilsr::tukey_mc_up

# Outliers
outliers3 <- S3 >= tukey_threshold3
sum(outliers3)
## [1] 0
# Histogram by "outlieness" score
ggplot() +
  geom_histogram(aes(S3), color = "black", fill = "blue", alpha = 0.3) +
  theme_bigstatsr() +
  geom_vline(xintercept=tukey_threshold3, colour="red") +
  labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Clean-up
# Clean-up
rm(non_outliers3,U3,prob3,S3,tukey_threshold3,outliers3) # vars_not_in_LD2

Update phenotypes table

updated_phenotypes.df <- wecare_phenotypes.df[!wecare_phenotypes.df$outlier,]
dim(updated_phenotypes.df)
## [1] 333  46
eigenvectors.mx <- wecare.svd3$u
dim(eigenvectors.mx)
## [1] 333  10
colnames(eigenvectors.mx) <- 
  c("pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10")

updated_phenotypes.df <- cbind(updated_phenotypes.df, eigenvectors.mx)
dim(updated_phenotypes.df)
## [1] 333  56
colnames(updated_phenotypes.df)
##  [1] "family.ID"      "sample.ID"      "paternal.ID"    "maternal.ID"   
##  [5] "sex"            "affection"      "gwas_id"        "merged_id"     
##  [9] "filter"         "cc"             "setno"          "registry"      
## [13] "family_history" "age_dx"         "age_ref"        "rstime"        
## [17] "eig1_gwas"      "eig2_gwas"      "eig3_gwas"      "eig4_gwas"     
## [21] "eig5_gwas"      "stage"          "er"             "pr"            
## [25] "hist_cat"       "hormone"        "chemo_cat"      "br_xray"       
## [29] "br_xray_dose"   "age_menarche"   "age_1st_ftp"    "age_menopause" 
## [33] "num_preg"       "bmi_age18"      "bmi_dx"         "bmi_ref"       
## [37] "eig1_wecare"    "eig2_wecare"    "eig3_wecare"    "eig4_wecare"   
## [41] "eig5_wecare"    "Batch_ID"       "Subject_ID"     "Barcode"       
## [45] "danish"         "outlier"        "pc1"            "pc2"           
## [49] "pc3"            "pc4"            "pc5"            "pc6"           
## [53] "pc7"            "pc8"            "pc9"            "pc10"
# Check consistency with the previous eigenvectors from WES
plot(updated_phenotypes.df$eig1_wecare,
     updated_phenotypes.df$pc1,main="PC1: new vs old WES")

plot(updated_phenotypes.df$eig2_wecare,
     updated_phenotypes.df$pc2,main="PC2: new vs old WES")

# Check consistency with the previous eigenvectors from GWAS
plot(updated_phenotypes.df$eig1_gwas,
     updated_phenotypes.df$pc1,main="PC1: new WES vs GWAs")

plot(updated_phenotypes.df$eig2_gwas,
     updated_phenotypes.df$pc2,main="PC2: new WES vs GWAs")

# Clean-up
rm(wecare_phenotypes.df, eigenvectors.mx, vars_not_in_LD1)
#somehow close wecare.bed ?

Detailed PCA plots

plot(wecare.svd3, type = "scores") +
     aes(color = updated_phenotypes.df$affection == 2) +
     labs(title = NULL, color = "Case")

plot(wecare.svd3, type = "scores", scores=3:4) +
     aes(color = updated_phenotypes.df$affection == 2) +
     labs(title = NULL, color = "Case")

plot(wecare.svd3, type = "scores", scores=5:6) +
     aes(color = updated_phenotypes.df$affection == 2) +
     labs(title = NULL, color = "Case")

plot(wecare.svd3, type = "scores", scores=7:8) +
     aes(color = updated_phenotypes.df$affection == 2) +
     labs(title = NULL, color = "Case")

plot(wecare.svd3, type = "scores", scores=9:10) +
     aes(color = updated_phenotypes.df$affection == 2) +
     labs(title = NULL, color = "Case")

Save results

save.image(file.path(data_folder,"s04_calculate_PCs.RData"))
save(updated_phenotypes.df,file=file.path(data_folder,"s04_phenotypes_with_PCs.RData"))

End section

ls()
## [1] "base_folder"           "data_folder"           "NCORES"               
## [4] "project_folder"        "scripts_folder"        "updated_phenotypes.df"
## [7] "wecare_variants.df"    "wecare.bed"            "wecare.svd3"
Sys.time()
## [1] "2021-01-22 23:50:53 GMT"
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb)  max used  (Mb)
## Ncells 2157082 115.3    5738170 306.5         NA   7172712 383.1
## Vcells 6509215  49.7   52484708 400.5      16384 128129254 977.6